In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# to visulize the weekdays
import plotly.express as px
#shift +tab is used to get the function details and auto sugesstion
In [1]:
import seaborn as sns
In [11]:
files = os.listdir(r"G:\Tutorials Notes\UdemyCourses\DataAnalysis_Projects\Uber_NewYork_DA\DataFiles\drive-download-20210609T171232Z-001")
#r is to show the exact address of the data location

# for the last 7 files only
files = files[-7:]
files
Out[11]:
['uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-janjune-15.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [12]:
# have to remove a file,list
files.remove('uber-raw-data-janjune-15.csv')
files
Out[12]:
['uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [ ]:
 
In [13]:
# now to concatinate the data of multiple files

path = r'G:\Tutorials Notes\UdemyCourses\DataAnalysis_Projects\Uber_NewYork_DA\DataFiles\drive-download-20210609T171232Z-001'

final = pd.DataFrame()
for file in files:
    df = pd.read_csv(path+ "/" + file, encoding = 'utf-8')
    final = pd.concat([final, df])
    
In [14]:
final.shape
Out[14]:
(4534327, 4)
In [18]:
df = final
df.head()
Out[18]:
Date/Time Lat Lon Base
0 4/1/2014 0:11:00 40.7690 -73.9549 B02512
1 4/1/2014 0:17:00 40.7267 -74.0345 B02512
2 4/1/2014 0:21:00 40.7316 -73.9873 B02512
3 4/1/2014 0:28:00 40.7588 -73.9776 B02512
4 4/1/2014 0:33:00 40.7594 -73.9722 B02512
In [19]:
df.dtypes
Out[19]:
Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object
In [25]:
df['Date/Time'].head()
Out[25]:
0    4/1/2014 0:11:00
1    4/1/2014 0:17:00
2    4/1/2014 0:21:00
3    4/1/2014 0:28:00
4    4/1/2014 0:33:00
Name: Date/Time, dtype: object
In [30]:
#have to convert the first coloum into date time object

df['Date/Time'] = pd.to_datetime(df['Date/Time'],format = '%m/%d/%Y %H:%M:%S')

df.dtypes
Out[30]:
Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object
In [32]:
df.head()
Out[32]:
Date/Time Lat Lon Base
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512
In [33]:
df['day'] = df['Date/Time'].dt.day
df['minute'] = df['Date/Time'].dt.minute
df['month'] = df['Date/Time'].dt.month
df['year'] = df['Date/Time'].dt.year
Out[33]:
Date/Time Lat Lon Base day minute month year
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 11 4 2014
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 17 4 2014
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 21 4 2014
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 28 4 2014
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 33 4 2014
In [35]:
df['hour'] = df['Date/Time'].dt.hour
In [34]:
df['weekday'] = df['Date/Time'].dt.day_name()
Out[34]:
Date/Time Lat Lon Base day minute month year weekday
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 11 4 2014 Tuesday
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 17 4 2014 Tuesday
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 21 4 2014 Tuesday
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 28 4 2014 Tuesday
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 33 4 2014 Tuesday
In [36]:
df.head()
Out[36]:
Date/Time Lat Lon Base day minute month year weekday hour
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 11 4 2014 Tuesday 0
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 17 4 2014 Tuesday 0
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 21 4 2014 Tuesday 0
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 28 4 2014 Tuesday 0
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 33 4 2014 Tuesday 0
In [ ]:
 
In [37]:
# to see the counts 
df['weekday'].value_counts()
Out[37]:
Thursday     755145
Friday       741139
Wednesday    696488
Tuesday      663789
Saturday     646114
Monday       541472
Sunday       490180
Name: weekday, dtype: int64
In [39]:
 
In [40]:
#bar bar plot 
px.bar(x=df['weekday'].value_counts().index ,y = df['weekday'].value_counts())

# so rush is higher on thursday and friday
In [ ]:
 
In [41]:
# to plot the histogram
plt.hist(df['hour'])

# it means during the eveing time , they get most rides
Out[41]:
(array([216928., 103517., 227152., 543565., 324851., 366329., 819491.,
        660869., 579117., 692508.]),
 array([ 0. ,  2.3,  4.6,  6.9,  9.2, 11.5, 13.8, 16.1, 18.4, 20.7, 23. ]),
 <BarContainer object of 10 artists>)
In [ ]:
 
In [43]:
# tpo view the rides for each unique months
df['month'].unique()
Out[43]:
array([4, 8, 7, 6, 5, 9], dtype=int64)
In [48]:
# to iterate and vizulise for each month, we use the subplot

for i,month in enumerate(df['month'].unique()):
    print(i)
    print(month)
0
4
1
8
2
7
3
6
4
5
5
9
In [50]:
# to plot for each month
plt.figure(figsize = (40,20))

for i,month in enumerate(df['month'].unique()):
    plt.subplot(3,2,i+1)
    df[df['month']==month]['hour'].hist()
    
#by this we can see that for each month uber get the max rides on evening hours    
    
In [ ]:
 
In [52]:
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot
In [ ]:
 
In [53]:
# to visulize in a much better way
df.groupby('month')['hour'].count()
Out[53]:
month
4     564516
5     652435
6     663844
7     796121
8     829275
9    1028136
Name: hour, dtype: int64
In [56]:
#Q = which month has maximum rides
trace1 = go.Bar(x = df.groupby('month')['hour'].count().index, y = df.groupby('month')['hour'].count(), name = 'Priority')

iplot([trace1])
# So September has the maximum rrides
In [59]:
sns.displot(df['day'])
Out[59]:
<seaborn.axisgrid.FacetGrid at 0x220e76f3df0>
In [61]:
# Analysis of Journer on each day
plt.figure(figsize=(10,8))
plt.hist(df['day'], bins = 30, rwidth = 0.8, range= (0.5, 30.5))  # adding range makes the difference
plt.xlabel('date of the months')
plt.ylabel('Total journey')
plt.title('Journey BY Month Day')
Out[61]:
Text(0.5, 1.0, 'Journey BY Month Day')
In [ ]:
 
In [62]:
#@ Analysis of total rides month wise
plt.figure(figsize = (20,8))

for i,month in enumerate(df['month'].unique(),1):
    plt.subplot(3,2,i)
    df_out = df[df['month']==month]
    plt.hist(df_out['day'])
    plt.xlabel('days in month  {}'.format(month))
    plt.ylabel('total_rides')
In [ ]:
 
In [65]:
#Q-  Analysing rush in hours

# simplar to the line plot
ax = sns.pointplot(x = 'hour', y='Lat',data = df, hue = 'weekday')
ax.set_title('HoursOfDay vs Latitue of Passenger')
Out[65]:
Text(0.5, 1.0, 'HoursOfDay vs Latitue of Passenger')
In [ ]:
 
In [66]:
#Q =  analysis which base number gets popular by month  name
df.head()
Out[66]:
Date/Time Lat Lon Base day minute month year weekday hour
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 11 4 2014 Tuesday 0
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 17 4 2014 Tuesday 0
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 21 4 2014 Tuesday 0
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 28 4 2014 Tuesday 0
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 33 4 2014 Tuesday 0
In [67]:
base = df.groupby(['Base','month'])['Date/Time'].count().reset_index()
base
Out[67]:
Base month Date/Time
0 B02512 4 35536
1 B02512 5 36765
2 B02512 6 32509
3 B02512 7 35021
4 B02512 8 31472
5 B02512 9 34370
6 B02598 4 183263
7 B02598 5 260549
8 B02598 6 242975
9 B02598 7 245597
10 B02598 8 220129
11 B02598 9 240600
12 B02617 4 108001
13 B02617 5 122734
14 B02617 6 184460
15 B02617 7 310160
16 B02617 8 355803
17 B02617 9 377695
18 B02682 4 227808
19 B02682 5 222883
20 B02682 6 194926
21 B02682 7 196754
22 B02682 8 173280
23 B02682 9 197138
24 B02764 4 9908
25 B02764 5 9504
26 B02764 6 8974
27 B02764 7 8589
28 B02764 8 48591
29 B02764 9 178333
In [70]:
plt.figure(figsize=(20,6))
sns.lineplot(x='month',y='Date/Time',hue='Base', data = base)
Out[70]:
<AxesSubplot:xlabel='month', ylabel='Date/Time'>
In [ ]:
 
In [ ]:
#Q performe cross analysis 
#1= HeatMap by hour and weekday
#2 = HeatMap by hour and day
#3 = HeatMap by Month and day
#4 = HeatMap by Month and weekday
In [71]:
def count_rows(rows):
    return len(rows)
In [85]:
by_cross = df.groupby(['weekday','hour']).apply(count_rows)
by_cross
Out[85]:
weekday    hour
Friday     0       13716
           1        8163
           2        5350
           3        6930
           4        8806
                   ...  
Wednesday  19      47017
           20      47772
           21      44553
           22      32868
           23      18146
Length: 168, dtype: int64
In [86]:
pivot = by_cross.unstack()
pivot
Out[86]:
hour 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
weekday
Friday 13716 8163 5350 6930 8806 13450 23412 32061 31509 25230 ... 36206 43673 48169 51961 54762 49595 43542 48323 49409 41260
Monday 6436 3737 2938 6232 9640 15032 23746 31159 29265 22197 ... 28157 32744 38770 42023 37000 34159 32849 28925 20158 11811
Saturday 27633 19189 12710 9542 6846 7084 8579 11014 14411 17669 ... 31418 38769 43512 42844 45883 41098 38714 43826 47951 43174
Sunday 32877 23015 15436 10597 6374 6169 6596 8728 12128 16401 ... 28151 31112 33038 31521 28291 25948 25076 23967 19566 12166
Thursday 9293 5290 3719 5637 8505 14169 27065 37038 35431 27812 ... 36699 44442 50560 56704 55825 51907 51990 51953 44194 27764
Tuesday 6237 3509 2571 4494 7548 14241 26872 36599 33934 25023 ... 34846 41338 48667 55500 50186 44789 44661 39913 27712 14869
Wednesday 7644 4324 3141 4855 7511 13794 26943 36495 33826 25635 ... 35148 43388 50684 55637 52732 47017 47772 44553 32868 18146

7 rows × 24 columns

In [ ]:
 
In [89]:
plt.figure(figsize=(15,6))
sns.heatmap(pivot)
Out[89]:
<AxesSubplot:xlabel='hour', ylabel='weekday'>
In [90]:
def heatmap(col1,col2):
    by_cross = df.groupby([col1,col2]).apply(count_rows)
    pivot = by_cross.unstack()
    plt.figure(figsize=(15,6))
    return sns.heatmap(pivot)
    
    
In [91]:
heatmap('day','hour')
Out[91]:
<AxesSubplot:xlabel='hour', ylabel='day'>
In [96]:
plt.figure(figsize = (12,6))
plt.plot(df['Lon'],df['Lat'],'r+',ms = 0.5)
plt.xlim(-74.2,-73.7)
plt.ylim(40.6,41)
Out[96]:
(40.6, 41.0)
In [ ]:
 
In [97]:
df_out = df[df['weekday'] =='Sunday']
df
Out[97]:
Date/Time Lat Lon Base day minute month year weekday hour
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 11 4 2014 Tuesday 0
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 17 4 2014 Tuesday 0
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 21 4 2014 Tuesday 0
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 28 4 2014 Tuesday 0
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 33 4 2014 Tuesday 0
... ... ... ... ... ... ... ... ... ... ...
1028131 2014-09-30 22:57:00 40.7668 -73.9845 B02764 30 57 9 2014 Tuesday 22
1028132 2014-09-30 22:57:00 40.6911 -74.1773 B02764 30 57 9 2014 Tuesday 22
1028133 2014-09-30 22:58:00 40.8519 -73.9319 B02764 30 58 9 2014 Tuesday 22
1028134 2014-09-30 22:58:00 40.7081 -74.0066 B02764 30 58 9 2014 Tuesday 22
1028135 2014-09-30 22:58:00 40.7140 -73.9496 B02764 30 58 9 2014 Tuesday 22

4534327 rows × 10 columns

In [98]:
df_out.shape
Out[98]:
(490180, 10)
In [99]:
df_out.head()
Out[99]:
Date/Time Lat Lon Base day minute month year weekday hour
6965 2014-04-06 00:00:00 40.6547 -74.3033 B02512 6 0 4 2014 Sunday 0
6966 2014-04-06 00:00:00 40.7356 -74.0006 B02512 6 0 4 2014 Sunday 0
6967 2014-04-06 00:00:00 40.7421 -74.0041 B02512 6 0 4 2014 Sunday 0
6968 2014-04-06 00:00:00 40.7401 -74.0053 B02512 6 0 4 2014 Sunday 0
6969 2014-04-06 00:01:00 40.7368 -73.9877 B02512 6 1 4 2014 Sunday 0
In [104]:
rush = df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
rush
Out[104]:
Lat Lon weekday
0 39.9374 -74.0722 1
1 39.9378 -74.0721 1
2 39.9384 -74.0742 1
3 39.9385 -74.0734 1
4 39.9415 -74.0736 1
... ... ... ...
209225 41.3141 -74.1249 1
209226 41.3180 -74.1298 1
209227 41.3195 -73.6905 1
209228 41.3197 -73.6903 1
209229 42.1166 -72.0666 1

209230 rows × 3 columns

In [105]:
rush.coloumns = ['Lat','Lon','no of trips']
rush
<ipython-input-105-ba1cc865827f>:1: UserWarning:

Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access

Out[105]:
Lat Lon weekday
0 39.9374 -74.0722 1
1 39.9378 -74.0721 1
2 39.9384 -74.0742 1
3 39.9385 -74.0734 1
4 39.9415 -74.0736 1
... ... ... ...
209225 41.3141 -74.1249 1
209226 41.3180 -74.1298 1
209227 41.3195 -73.6905 1
209228 41.3197 -73.6903 1
209229 42.1166 -72.0666 1

209230 rows × 3 columns

In [106]:
 
ERROR: Could not find a version that satisfies the requirement foilum (from versions: none)
ERROR: No matching distribution found for foilum
In [107]:
import folium
from folium.plugins import HeatMap
In [109]:
baseMap = folium.Map()
baseMap
Out[109]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [112]:
HeatMap(rush, zoom = 20, radius = 15).add_to(baseMap)

baseMap
Out[112]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
 
In [113]:
def plot(df,day):
    baseMap = folium.Map()
    df_out = df[df['weekday'] ==day]
    rush = df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
    HeatMap(rush, zoom = 20, radius = 15).add_to(baseMap)
    return baseMap
In [114]:
plot(df,'Sunday')
Out[114]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
 
In [115]:
# Data Prepartions
uber_15 = pd.read_csv(r'G:\Tutorials Notes\UdemyCourses\DataAnalysis_Projects\Uber_NewYork_DA\DataFiles\drive-download-20210609T171232Z-001\uber-raw-data-janjune-15.csv')
In [116]:
uber_15.head()
Out[116]:
Dispatching_base_num Pickup_date Affiliated_base_num locationID
0 B02617 2015-05-17 09:47:00 B02617 141
1 B02617 2015-05-17 09:47:00 B02617 65
2 B02617 2015-05-17 09:47:00 B02617 100
3 B02617 2015-05-17 09:47:00 B02774 80
4 B02617 2015-05-17 09:47:00 B02617 90
In [117]:
uber_15.dtypes
Out[117]:
Dispatching_base_num    object
Pickup_date             object
Affiliated_base_num     object
locationID               int64
dtype: object
In [118]:
uber_15['Pickup_date'] =pd.to_datetime(uber_15['Pickup_date'],format = '%Y-%m-%d %H:%M:%S')
In [119]:
uber_15.dtypes
Out[119]:
Dispatching_base_num            object
Pickup_date             datetime64[ns]
Affiliated_base_num             object
locationID                       int64
dtype: object
In [120]:
uber_15['day'] = uber_15['Pickup_date'].dt.day
uber_15['minute'] = uber_15['Pickup_date'].dt.minute
uber_15['month'] = uber_15['Pickup_date'].dt.month
uber_15['year'] = uber_15['Pickup_date'].dt.year
uber_15['hour'] = uber_15['Pickup_date'].dt.hour
uber_15['weekday'] = uber_15['Pickup_date'].dt.day_name()
In [121]:
uber_15.head()
Out[121]:
Dispatching_base_num Pickup_date Affiliated_base_num locationID day minute month year hour weekday
0 B02617 2015-05-17 09:47:00 B02617 141 17 47 5 2015 9 Sunday
1 B02617 2015-05-17 09:47:00 B02617 65 17 47 5 2015 9 Sunday
2 B02617 2015-05-17 09:47:00 B02617 100 17 47 5 2015 9 Sunday
3 B02617 2015-05-17 09:47:00 B02774 80 17 47 5 2015 9 Sunday
4 B02617 2015-05-17 09:47:00 B02617 90 17 47 5 2015 9 Sunday
In [124]:
uber_15['month'].value_counts()
Out[124]:
6    2816895
5    2695553
4    2280837
2    2263620
3    2259773
1    1953801
Name: month, dtype: int64
In [123]:
#Q = uber pickup by month in NYC
px.bar(x = uber_15['month'].value_counts().index,
      y = uber_15['month'].value_counts())
In [ ]:
 
In [125]:
#Q =  Analysis rush in each hour in NYCC

plt.figure(figsize = (12,6))
sns.countplot(uber_15['hour'])
C:\Users\Gagan Shrivastava\Anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[125]:
<AxesSubplot:xlabel='hour', ylabel='count'>
In [ ]:
 
In [126]:
#q =  anaysissis of rush on the basiosc of days and hours 

summary = uber_15.groupby(['weekday','hour'])['Pickup_date'].count().reset_index()
summary.head()
Out[126]:
weekday hour Pickup_date
0 Friday 0 85939
1 Friday 1 46616
2 Friday 2 28102
3 Friday 3 19518
4 Friday 4 23575
In [129]:
plt.figure(figsize = (12,8))
sns.pointplot(x = 'hour',y = 'Pickup_date',hue = 'weekday', data = summary)
Out[129]:
<AxesSubplot:xlabel='hour', ylabel='Pickup_date'>
In [ ]: